* Set the working directory to the main one of the project
* (every path used below - dsets\, model\, tables\, codes\ - is relative to it)

* Start with no dataset in memory
clear
* Do not pause the output at -more- prompts, so that the file runs unattended
set more off

*** This is the main code that generates the empirical results and the datasets used for the quantitative model of "Technological Waves, Knowledge Diffusion, and Local Growth" (Berkes, Gaetani, Mestieri)

*******************************************************

*** 0 *** - Construct population panel and list of czs

* 1870-1930: Use CPP-based crosswalk:
* The 1870 file starts the stacked (long) population file temp1.dta
import delimited dsets\cz_data\population\population_1870_cpp.csv, clear
generate decade = 1870
save dsets\temp_datasets_todelete\temp1.dta, replace

* 1890 is not read from a file: it is INTERPOLATED from 1880 and 1900.
* Step 1: put the 1880 and 1900 populations side by side, one row per cz
import delimited dsets\cz_data\population\population_1880_cpp.csv, clear
rename population pop_1880
save dsets\temp_datasets_todelete\temp1_auxaux.dta, replace
import delimited dsets\cz_data\population\population_1900_cpp.csv, clear
rename population pop_1900
merge 1:1 cz using dsets\temp_datasets_todelete\temp1_auxaux.dta

* Step 2: a cz present in only one of the two years gets the other year imputed
* with the 1880-weighted mean growth rate (left in r(mean) by summarize).
* _merge == 2: only in the 1880 file; _merge == 1: only in the 1900 file.
generate growth_1880_1900 = (pop_1900/pop_1880) - 1
summarize growth_1880_1900 [weight = pop_1880]
replace pop_1900 = pop_1880*(1+`r(mean)') if _merge == 2
replace pop_1880 = pop_1900/(1+`r(mean)') if _merge == 1

* Step 3: 1890 population is the midpoint of 1880 and 1900 in logs
generate population = exp(0.5*(ln(pop_1880)+ln(pop_1900)))
keep cz population
generate decade = 1890

* Stack 1890 on top of 1870
append using dsets\temp_datasets_todelete\temp1.dta
save dsets\temp_datasets_todelete\temp1.dta, replace

* Stack the remaining years onto temp1.dta, in increasing order of year:
*   1910-1930: files from the CPP-based crosswalk (suffix _cpp)
*   1950-2010: files from the area-based crosswalk (suffix _cw)
forvalues year = 1910(20)2010 {
	if `year' <= 1930 {
		local xwalk cpp
	}
	else {
		local xwalk cw
	}
	import delimited dsets\cz_data\population\population_`year'_`xwalk'.csv, clear
	generate decade = `year'
	append using dsets\temp_datasets_todelete\temp1.dta
	save dsets\temp_datasets_todelete\temp1.dta, replace
	* 2-second pause after each save (presumably to let the file be fully written
	* or synced before it is read again - TODO confirm)
	sleep 2000
}

* Work on the stacked population file left in memory by the loop above.
* Keep only czs with strictly positive population in all 8 periods ...
drop if population == 0
bysort cz: generate n_decades = _N
keep if n_decades == 8

* ... and apply the remaining filters on 1890 onwards only
keep if decade >= 1890

* Attach state, region, and Census division
merge m:1 cz using dsets\cz_data\cz_state_region_division.dta, keep(match) nogen

* Drop Alaska and Hawaii
drop if inlist(state_id, 2, 12)

* Size filter: the cz must hold at least 0.01% of the US population in each decade.
* The two czs listed explicitly have 0 patents in at least one of the periods (footnote 9).
bysort decade: egen us_pop = total(population)
generate share = population / us_pop
bysort cz: egen smallest_share = min(share)
keep if smallest_share >= 0.0001 & !inlist(cz, 34801, 30602)

* One row per retained cz
keep cz
duplicates drop

* This is the final list of CZs in the dataset (csv copy for the model, dta copy for this file)
export delimited using model\dsets_for_model\cz_to_consider_cpp.csv, replace novarnames
save dsets\cz_data\cz_to_consider_cpp.dta, replace

* Build the cz-by-decade population panel, restricted to the final list of czs.
* temp1.dta still holds all years (1870 included), so lags exist for 1890.
use dsets\temp_datasets_todelete\temp1.dta, clear
merge m:1 cz using dsets\cz_data\cz_to_consider_cpp.dta, keep(match) nogen
* Population share within each decade; the total is taken over the retained czs only
bys decade: egen totpop_us = sum(population)
gen pop_share = population / totpop_us
drop totpop_us

* Periods are 20 years apart, so l. and l2. refer to 20 and 40 years earlier
xtset cz decade, delta(20)
gen lagged_population = l.population
gen lagged_population_2 = l2.population
* Log population growth over the 20-year period
gen pop_growth = ln(population) - ln(lagged_population)
gen lagged_pop_share = l.pop_share

gen log_lagged_population = ln(lagged_population)
gen log_lagged_population_2 = ln(lagged_population_2)

* keep(match master): cz-decades without human capital data stay in the panel
merge 1:1 cz decade using dsets\cz_data\panel_hc_summary_cpp.dta, keep(match master) nogen /* Merge with dataset with human capital harmonized ranking */
rename summary_ranking hc_summary_ranking

* Attach the geography variables and build one id per decade x division cell
* (used below as the absorbed fixed effect)
merge m:1 cz using dsets\cz_data\cz_state_region_division.dta, keep(match) nogen
egen decade_division_id = group(decade division)

* Declare the panel again after the merges before using the lag operator
xtset cz decade, delta(20)
gen lagged_hc_summary_ranking = l.hc_summary_ranking

save dsets\cz_data\population\population_panel_cpp.dta, replace /* Final panel with population and human capital */

* Create some dataset FOR MODEL PURPOSES (csv files without a header row):

* (decade, cz, population), using the panel currently in memory
keep cz decade population
order decade, after(cz)
sort decade cz
export delimited using model\dsets_for_model\cz_decade_population_cpp.csv, replace novarnames

* One file per geographic grouping, with one row per cz: (cz, region) and (cz, division)
foreach geo in region division {
	use dsets\cz_data\population\population_panel_cpp.dta, clear
	keep cz `geo'
	duplicates drop
	sort cz
	export delimited using model\dsets_for_model\cz_`geo'_cpp.csv, replace novarnames
}

*** 1 *** - Empirical facts of Section 2

** 1a: Construct exposure measures with leave-one-out (loo) method:

* Give each retained cz a consecutive id 1, 2, ... (in increasing order of cz);
* the leave-one-out loop below iterates over this id
use dsets\cz_data\cz_to_consider_cpp.dta, clear
sort cz
gen cz_id = _n
save dsets\cz_data\cz_id_cpp.dta, replace 

* Patent counts (totpat_class_cz_decade) by cz, decade and class
use dsets\patents_data\cz_decade_class_patents_cpp.dta, clear

* No keep() option: czs that are not in the final list stay in the data, with cz_id missing
merge m:1 cz using dsets\cz_data\cz_id_cpp.dta, nogen

* Replace the string class label with a numeric id, needed for xtset and for the merges below
encode macroipc, gen(macroipc_id)
drop macroipc

save dsets\temp_datasets_todelete\temp4.dta, replace

* Leave-one-out (loo): for every cz in the final list, recompute the aggregate class
* shares and their log growth (alpha_hat) with that cz's own patents excluded.
* The results for all czs are stacked in alpha_hat_11cl_dec20_loo_cpp.dta.
summarize cz_id
local n_cz = r(max)
forvalues i = 1/`n_cz' {

	* progress indicator
	display `i'

	use dsets\temp_datasets_todelete\temp4.dta, clear

	* leave out cz i (observations with missing cz_id also pass this filter)
	keep if cz_id != `i'

	* share of each class in the remaining patents of the decade
	bysort decade: egen all_pat = total(totpat_class_cz_decade)
	bysort decade macroipc_id: egen class_pat = total(totpat_class_cz_decade)
	generate share_class_us = class_pat / all_pat

	* one row per (decade, class)
	keep decade macroipc_id share_class_us
	duplicates drop

	* log growth of the class share with respect to the previous 20-year period
	xtset macroipc_id decade, delta(20)
	generate alpha_hat = ln(share_class_us) - ln(l.share_class_us)

	keep decade macroipc_id alpha_hat share_class_us
	keep if decade >= 1870

	* tag the rows with the cz that was left out
	generate cz_id = `i'

	* the first iteration starts the output file, the later ones are stacked onto it
	if `i' > 1 {
		append using dsets\temp_datasets_todelete\alpha_hat_11cl_dec20_loo_cpp.dta
	}
	save dsets\temp_datasets_todelete\alpha_hat_11cl_dec20_loo_cpp.dta, replace
}

** 1b: Measure of exposure to the technological wave at the cz level:
* Balanced skeleton: every retained cz x 11 classes x 8 periods (1870, 1890, ..., 2010),
* so that cz-class-decade cells without patents can be filled with zeros below
use dsets\cz_data\cz_to_consider_cpp.dta, clear
expand = 11
bys cz: gen macroipc_id = _n
expand = 8
bys cz macroipc_id: gen decade = 1870 + (_n-1)*20
sort decade cz macroipc_id
save dsets\temp_datasets_todelete\temp3.dta, replace

use dsets\patents_data\cz_decade_class_patents_cpp.dta, clear
* NOTE(review): the skeleton ids 1..11 must coincide with the ids produced by encode here,
* i.e. macroipc is assumed to take exactly 11 distinct values - TODO confirm
encode macroipc, gen(macroipc_id)
drop macroipc
drop if decade < 1870
* No keep() option: skeleton cells absent from the patent data are added, and
* patent observations of czs outside the skeleton are kept as well
merge 1:1 decade cz macroipc_id using dsets\temp_datasets_todelete\temp3.dta, nogen
replace totpat_class_cz_decade = 0 if totpat_class_cz_decade == .

* Share of class s in the patents of cz n in a decade (missing when the cz has no patents)
bys decade cz: egen totpat_cz_decade = sum(totpat_class_cz_decade)
gen pie_s_given_n = totpat_class_cz_decade / totpat_cz_decade

* Aggregate shares:
* (computed over all observations in memory, not only over the retained czs)
bys decade: egen totpat_decade = sum(totpat_class_cz_decade)
bys decade macroipc_id: egen totpat_class_decade = sum(totpat_class_cz_decade)
gen pie_s = totpat_class_decade / totpat_decade

* Attach the consecutive cz id (missing for czs that are not in the final list)
merge m:1 cz using dsets\cz_data\cz_id_cpp.dta, nogen

save dsets\temp_datasets_todelete\temp5.dta, replace

* Generate a wide dataset with cz decade share
* (one row per cz-decade, one pair of columns pie_s_given_n# / pie_s# per class id)
use dsets\temp_datasets_todelete\temp5.dta, clear

keep cz decade macroipc_id pie_s_given_n pie_s
reshape wide pie_s_given_n pie_s, i(cz decade) j(macroipc_id)
save dsets\temp_datasets_todelete\cz_decade_patent_shares_wide_cpp.dta, replace

use dsets\temp_datasets_todelete\temp5.dta, clear

* Adjust the timing: exposure uses patents from t-1:
* the shares observed in decade_starting are paired with the alpha_hat of the following period
rename decade decade_starting
gen decade = decade_starting + 20
order decade, after(cz)
* keep(match): only rows with a leave-one-out alpha_hat for this cz_id, class and decade remain
merge m:1 decade macroipc_id cz_id using dsets\temp_datasets_todelete\alpha_hat_11cl_dec20_loo_cpp.dta, keep(match) nogen

* Exposure of a cz: sum over classes of (local class share at t-1) x (leave-one-out class growth).
* egen sum() skips missing terms, so a cz-decade whose terms are all missing gets 0 rather than missing
bys decade_starting cz: egen exposure = sum(pie_s_given_n*alpha_hat)

* Collapse to one row per cz-decade
keep decade decade_starting cz cz_id exposure totpat_cz_decade
duplicates drop

* keep(match using): every cz-decade of the population panel is kept, with or without an exposure value
merge 1:1 decade cz using dsets\cz_data\population\population_panel_cpp.dta, keep(match using) nogen 
xtset cz decade, delta(20)

save dsets\temp_datasets_todelete\temp10.dta, replace

* Create the local industry shocks:
* (external do-file, not shown here; it is expected to write dsets\cz_data\industry_shocks_cpp.dta,
* which is merged right below - TODO confirm)
do codes\work_with_ipums_industries_cpp.do

* Reload the exposure data saved before the call above
use dsets\temp_datasets_todelete\temp10.dta, clear
* keep(master match): cz-decades without an industry shock stay in the data
merge 1:1 cz decade using dsets\cz_data\industry_shocks_cpp.dta, keep(master match) nogen

* Create some useful variable:
gen log_population = ln(population)
* missing when the cz has zero patents in the decade
gen log_totpat_cz_decade = ln(totpat_cz_decade)

save dsets\cz_data\dataset_reduced_form_cpp.dta, replace /* This is the final dataset used for the empirical analysis */

* Create a dataset that can be useful for later (one with all the pairs of cz origin and destination, and one with their distance)

* Step 1: list of distinct czs, numbered 1..N, saved as the "destination" list
use dsets\cz_data\dataset_reduced_form_cpp.dta, clear
keep cz 
duplicates drop
rename cz cz_destination
sort cz_destination
gen nnn = _n
* Number of distinct czs. It is read from the data instead of being hard-coded (it used
* to be the literal 485), so that the set of pairs stays complete if the cz list changes.
local n_cz = _N
save dsets\temp_datasets_todelete\cz_destination_nnn.dta, replace

* Step 2: replicate every cz N times as "origin", number the copies 1..N within origin,
* and attach the destination that carries the same number: this gives all N x N pairs
drop nnn
rename cz_destination cz_origin
expand = `n_cz'
sort cz_origin
bys cz_origin: gen nnn = _n
merge m:1 nnn using dsets\temp_datasets_todelete\cz_destination_nnn.dta, nogen
drop nnn
save dsets\cz_data\cz_origin_destination_cpp.dta, replace

* Step 3: keep the bilateral distances only for the pairs built above
use dsets\migration\cz_bilateral_distance.dta, clear
merge 1:1 cz_origin cz_destination using dsets\cz_data\cz_origin_destination_cpp.dta, keep(match) nogen
save dsets\migration\cz_bilateral_distance_for_gravity_cpp.dta, replace

** 1c: Generate empirical facts:

use dsets\cz_data\dataset_reduced_form_cpp.dta, clear

*** Appendix Table A.2:
* Summary statistics; the commented blocks reproduce the output of a previous run
sum population log_population totpat_cz_decade
/*
    Variable |        Obs        Mean    Std. Dev.       Min        Max
-------------+---------------------------------------------------------
  population |      3,880    295188.9    807644.2         12   1.79e+07
log_popula~n |      3,880    11.62713    1.315466   2.484907   16.70017
totpat_cz_~e |      3,395    1104.563    4740.961          0      77956
*/
sum pop_growth exposure local_industry_shock if decade >= 1910
/*
    Variable |        Obs        Mean    Std. Dev.       Min        Max
-------------+---------------------------------------------------------
  pop_growth |      2,910    .2249885    .3106415  -.6844766   2.427836
    exposure |      2,910   -.1073404    .0892518  -.4430177   .4049688
local_indu~k |      2,882   -.1499599    .1567352  -.6813521   .2541895
*/


* Binned scatter of population growth on exposure (50 bins), weighted by the lagged population
* share, with decade-division ids and the two lags of log population as controls;
* the plotted points are also saved for the model code.
* NOTE(review): decade_division_id enters controls() as a single variable here, whereas the
* regressions below absorb it as a fixed effect - confirm this is intended
binscatter pop_growth exposure if decade >= 1910 [weight = lagged_pop_share], n(50) controls(decade_division_id log_lagged_population log_lagged_population_2) ///
 reportreg savedata("model\dsets_for_model\fact_1") replace
/*
------------------------------------------------------------------------------
    __000003 |      Coef.   Std. Err.      t    P>|t|     [95% Conf. Interval]
-------------+----------------------------------------------------------------
    __000002 |   .4640188   .0537254     8.64   0.000      .358675    .5693625
       _cons |   .2794997   .0051927    53.83   0.000     .2693179    .2896815
------------------------------------------------------------------------------
*/

* Pieces shared by all the specifications of Table 1 and Appendix Table A.3
local baseline pop_growth exposure log_lagged_population log_lagged_population_2
local smpl if decade >= 1910
local popw [weight = lagged_pop_share]
local vce cluster(cz decade_division_id)
local fmt tex dec(3) nocons label

* Additional controls, added cumulatively across the columns of Table 1
local ctrl2 l.log_totpat_cz_decade
local ctrl3 `ctrl2' local_industry_shock
local ctrl4 `ctrl3' l.hc_summary_ranking

*** Table 1:
* Weighted regressions with decade x division fixed effects; the first column
* starts the tex file, the following ones are appended to it
reghdfe `baseline' `smpl' `popw', absorb(decade_division_id) `vce'
outreg2 using "tables\fact1.tex", replace `fmt'
reghdfe `baseline' `ctrl2' `smpl' `popw', absorb(decade_division_id) `vce'
outreg2 using "tables\fact1.tex", `fmt'
reghdfe `baseline' `ctrl3' `smpl' `popw', absorb(decade_division_id) `vce'
outreg2 using "tables\fact1.tex", `fmt'
reghdfe `baseline' `ctrl4' `smpl' `popw', absorb(decade_division_id) `vce'
outreg2 using "tables\fact1.tex", `fmt'

*** Alternative specifications (Appendix Table A.3):
* Full set of controls; columns: weighted / weighted + cz fixed effects /
* unweighted / unweighted + cz fixed effects
reghdfe `baseline' `ctrl4' `smpl' `popw', absorb(decade_division_id) `vce'
outreg2 using "tables\fact_1_weights_fe.tex", replace `fmt'
reghdfe `baseline' `ctrl4' `smpl' `popw', absorb(cz decade_division_id) `vce'
outreg2 using "tables\fact_1_weights_fe.tex", `fmt'
reghdfe `baseline' `ctrl4' `smpl', absorb(decade_division_id) `vce'
outreg2 using "tables\fact_1_weights_fe.tex", `fmt'
reghdfe `baseline' `ctrl4' `smpl', absorb(cz decade_division_id) `vce'
outreg2 using "tables\fact_1_weights_fe.tex", `fmt'


* Residualise population growth (ppp) and exposure (eee) on the baseline controls and the
* decade x division fixed effects, then summarise both residuals with population-share weights
reghdfe pop_growth log_lagged_population log_lagged_population_2 if decade >= 1910 [weight = lagged_pop_share], absorb(decade_division_id) cluster(cz) resid
predict ppp, res
reghdfe exposure log_lagged_population log_lagged_population_2 if decade >= 1910 [weight = lagged_pop_share], absorb(decade_division_id) cluster(cz) resid
predict eee, res
sum ppp eee [weight = lagged_pop_share]
* NOTE(review): the three numbers below are hard-coded; they look like a coefficient times the
* ratio of the two residual standard deviations, copied by hand from the output of a previous
* run - they are not updated automatically when the data change (TODO confirm their source)
display .4291818*.0586108/.1658325
* .15168733
drop ppp eee

* Same exercise with the full set of controls
reghdfe pop_growth log_lagged_population log_lagged_population_2 l.log_totpat_cz_decade l.hc_summary_ranking local_industry_shock if decade >= 1910 [weight = lagged_pop_share], absorb(decade_division_id) cluster(cz) resid
predict ppp, res
reghdfe exposure log_lagged_population log_lagged_population_2 l.log_totpat_cz_decade l.hc_summary_ranking local_industry_shock if decade >= 1910 [weight = lagged_pop_share], absorb(decade_division_id) cluster(cz) resid
predict eee, res
sum ppp eee [weight = lagged_pop_share]
* NOTE(review): hard-coded numbers again, see the note above
display .3414666*.056937/.1562292
* .1244459
drop ppp eee

******************************************
** Run regression and export data to show variation in exposure and local industrial shocks:
* aaa = exposure and bbb = local industry shock, both residualised on the two lags of log
* population and on the decade x division fixed effects
reghdfe exposure log_lagged_population log_lagged_population_2 if decade >= 1910 [weight = lagged_pop_share], absorb(decade_division_id) cluster(cz decade_division_id) resid
predict aaa, res
reghdfe local_industry_shock log_lagged_population log_lagged_population_2 if decade >= 1910 [weight = lagged_pop_share], absorb(decade_division_id) cluster(cz decade_division_id) resid
predict bbb, res
* Weighted regression of one residual on the other (output of a previous run below)
reg aaa bbb [weight = lagged_pop_share] if decade >= 1910
/*
      Source |       SS           df       MS      Number of obs   =     2,882
-------------+----------------------------------   F(1, 2880)      =     23.93
       Model |  .081633979         1  .081633979   Prob > F        =    0.0000
    Residual |  9.82520389     2,880  .003411529   R-squared       =    0.0082
-------------+----------------------------------   Adj R-squared   =    0.0079
       Total |  9.90683787     2,881   .00343868   Root MSE        =    .05841

------------------------------------------------------------------------------
         aaa |      Coef.   Std. Err.      t    P>|t|     [95% Conf. Interval]
-------------+----------------------------------------------------------------
         bbb |   .0706729   .0144475     4.89   0.000     .0423444    .0990013
       _cons |   -.000011    .001088    -0.01   0.992    -.0021443    .0021224
------------------------------------------------------------------------------
*/
* Weighted correlation between the two residuals (output of a previous run below)
corr aaa bbb if decade >= 1910 [weight = lagged_pop_share]
/*
             |      aaa      bbb
-------------+------------------
         aaa |   1.0000
         bbb |   0.0908   1.0000
*/

* Export (cz, decade, weight, two residuals) for the scatter plot, without a header row.
* From here on the data in memory are reduced to these columns
keep if decade >= 1910
keep cz decade lagged_pop_share aaa bbb
export delimited using "model\dsets_for_model\data_for_scatter_exposure_industry.csv", replace novarnames

********************************************
*** Some facts about the dispersion of patent shares and industry shares:

use dsets\temp_datasets_todelete\cz_decade_patent_shares_wide_cpp.dta, clear
* The industry shares file is not created in this file (presumably by
* codes\work_with_ipums_industries_cpp.do - TODO confirm)
merge 1:1 cz decade using dsets\temp_datasets_todelete\cz_decade_industry_shares_wide_cpp.dta, nogen

* Aggregate the 11 class shares of each cz into 7 "super classes".
* NOTE(review): the grouping relies on the numeric ids assigned by encode to macroipc
* (alphabetical order of the labels) - verify the mapping against the class labels
gen pie_super_ipc_1 = pie_s_given_n1 + pie_s_given_n2 + pie_s_given_n3
gen pie_super_ipc_2 = pie_s_given_n4 + pie_s_given_n5
gen pie_super_ipc_3 = pie_s_given_n6
gen pie_super_ipc_4 = pie_s_given_n7
gen pie_super_ipc_5 = pie_s_given_n8 + pie_s_given_n9
gen pie_super_ipc_6 = pie_s_given_n10
gen pie_super_ipc_7 = pie_s_given_n11

* Detailed summary (with percentiles) of the super-class shares across czs.
* The display lines below subtract two percentiles typed in by hand from this output;
* they are not recomputed when the data change.
* NOTE(review): the 1890 block is labelled "90-10 ranges" and the 2010 block
* "interquartile ranges" - confirm which percentiles were actually used in each
sum pie_super_ipc_* if decade == 1890, d
** 90-10 ranges:
* Super Class 1 (A)
display .4848485 - .1973684
* .2874801
* Super Class 2 (B)
display .5 - .2352941
* .2647059

sum pie_super_ipc_* if decade == 2010, d
** Largest classes, interquartile ranges:
* Super Class 1 (A)
display .4006654 - .101948
* .2987174
* Super Class 2 (B)
display .374752 - .1014028
* .2733492
* Super Class 6 (G)
display .3035142 - .0424137
* .2611005
* Super Class 7 (H)
display .2252914 - .0125314
* .21276

* Saved for the two steps that follow
save dsets\temp_datasets_todelete\temp12.dta, replace

* Dispersion of the log super-class shares across czs, net of decade x super-class means
use dsets\temp_datasets_todelete\temp12.dta, clear
keep decade cz pie_super_ipc_* 
* Long format: one row per cz, decade and super class (ipc_id = 1..7)
reshape long pie_super_ipc_, i(decade cz) j(ipc_id)
* Missing when the share is zero or missing, so those cells do not enter the regression
gen log_pie = ln(pie_super_ipc_)
* Fully interacted decade and super-class dummies, from 1890 onwards
reg log_pie decade##ipc_id if decade >= 1890
* Residuals are restricted to the estimation sample. Without the restriction, predict
* also fills in out-of-sample residuals for the decades excluded from the regression,
* and those enter the summary below (the recorded mean residual is not zero).
predict ppp if e(sample), res
sum ppp
* Output recorded BEFORE the restriction to e(sample) was added; re-run to refresh it:
/*
    Variable |        Obs        Mean    Std. Dev.       Min        Max
-------------+---------------------------------------------------------
         ppp |     24,094    .0067131    .6617953  -5.318788   2.979789
*/

* Export data to make histograms for Section 2:
* super-class shares of every cz in the first (1890) and last (2010) decade,
* written without a header row
use dsets\temp_datasets_todelete\temp12.dta, clear
keep decade pie_super_ipc_*
keep if inlist(decade, 1890, 2010)
sort decade
export delimited using "model\dsets_for_model\data_for_histograms_sec2.csv", replace novarnames
